Project Checkpoint 4

NAMES: Madeline Abbott, Aidan Teppema, Daisy Cho





Progress Made:

<<<<<<< HEAD

We began our exploratory analysis of the data by creating plots of potential variables of interest (such as hurricane frequency, category frequency, maximum wind speed, NAO index, and CO2 levels) over time. We looked for trends in these variables over time–trends that we could potentially use in our model…

======= <<<<<<< HEAD

We began our exploratory analysis of the data by creating plots of potential variables of interest (such as hurricane frequency, category frequency, maximum wind speed, NAO index, and CO2 levels) over time. We looked for trends in these variables over time–trends that we could potentially use in our model. This visuals help exlain the variables and we plan to adapt many of these plots to include them in our final report. Maybe even in a shiny ap!

=======

We began our exploratory analysis of the data by creating plots of potential variables of interest (such as hurricane frequency, category frequency, maximum wind speed, NAO index) over time. We looked for trends in these variables over time–trends that we could potentially use in our model…

>>>>>>> 2cf57e92ab75bb271078cf20c211637c4172c6fc >>>>>>> 48722bde6780321fa48cb715ace959574fced67a

Group Member Roles:

Data

Hurricanes

hurricane_data <- read_csv("Historical_Tropical_Storm_Tracks.csv")
## Parsed with column specification:
## cols(
##   FID = col_integer(),
##   YEAR = col_integer(),
##   MONTH = col_integer(),
##   DAY = col_integer(),
##   AD_TIME = col_character(),
##   BTID = col_integer(),
##   NAME = col_character(),
##   LAT = col_double(),
##   LONG = col_double(),
##   WIND_KTS = col_integer(),
##   PRESSURE = col_integer(),
##   CAT = col_character(),
##   BASIN = col_character(),
##   Shape_Leng = col_double()
## )
hurricane_data<-na.omit(hurricane_data)
hurricane_data<-subset(hurricane_data, select=-c(BTID, AD_TIME))

#transform(hurricane_data, char = as.numeric(LAT))
#transform(hurricane_data, char = as.numeric(LONG))

hurricane_data$LAT <- as.numeric(hurricane_data$LAT)
hurricane_data$LONG <- as.numeric(hurricane_data$LONG)

dim(hurricane_data)
## [1] 59228    12
names(hurricane_data)
##  [1] "FID"        "YEAR"       "MONTH"      "DAY"        "NAME"      
##  [6] "LAT"        "LONG"       "WIND_KTS"   "PRESSURE"   "CAT"       
## [11] "BASIN"      "Shape_Leng"
head(hurricane_data)
## # A tibble: 6 x 12
##     FID  YEAR MONTH   DAY     NAME   LAT   LONG WIND_KTS PRESSURE   CAT
##   <int> <int> <int> <int>    <chr> <dbl>  <dbl>    <int>    <int> <chr>
## 1  2001  1957     8     8 NOTNAMED  22.5 -140.0       50        0    TS
## 2  2002  1961    10     3  PAULINE  22.1 -140.2       45        0    TS
## 3  2003  1962     8    29        C  18.0 -140.0       45        0    TS
## 4  2004  1967     7    14   DENISE  16.6 -139.5       45        0    TS
## 5  2005  1972     8    16    DIANA  18.5 -139.8       70        0    H1
## 6  2006  1976     7    22    DIANA  18.6 -139.8       30        0    TD
## # ... with 2 more variables: BASIN <chr>, Shape_Leng <dbl>
summary(hurricane_data)
##       FID             YEAR          MONTH             DAY       
##  Min.   :    1   Min.   :1851   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.:14808   1st Qu.:1928   1st Qu.: 8.000   1st Qu.: 8.00  
##  Median :29614   Median :1970   Median : 9.000   Median :16.00  
##  Mean   :29614   Mean   :1957   Mean   : 8.541   Mean   :15.87  
##  3rd Qu.:44421   3rd Qu.:1991   3rd Qu.: 9.000   3rd Qu.:23.00  
##  Max.   :59228   Max.   :2008   Max.   :12.000   Max.   :31.00  
##      NAME                LAT             LONG           WIND_KTS     
##  Length:59228       Min.   : 4.20   Min.   :-180.0   Min.   : 10.00  
##  Class :character   1st Qu.:16.10   1st Qu.:-108.5   1st Qu.: 35.00  
##  Mode  :character   Median :21.20   Median : -81.2   Median : 50.00  
##                     Mean   :23.53   Mean   : -83.2   Mean   : 54.73  
##                     3rd Qu.:29.60   3rd Qu.: -62.2   3rd Qu.: 70.00  
##                     Max.   :69.00   Max.   : 180.0   Max.   :165.00  
##     PRESSURE          CAT               BASIN             Shape_Leng     
##  Min.   :   0.0   Length:59228       Length:59228       Min.   : 0.0000  
##  1st Qu.:   0.0   Class :character   Class :character   1st Qu.: 0.7071  
##  Median :   0.0   Mode  :character   Mode  :character   Median : 1.0296  
##  Mean   : 372.3                                         Mean   : 1.2020  
##  3rd Qu.: 990.0                                         3rd Qu.: 1.4318  
##  Max.   :1024.0                                         Max.   :11.1803

Exploratory analysis

# select only the hurricanes after 1950 (this is when the naming system started)
hurricanes_recent <- hurricane_data %>%
  filter(YEAR > 1949)

Some plots…

# Plotting hurricane denisty by location

get_density <- function(x, y, n = 100) {
  dens <- MASS::kde2d(x = x, y = y, n = n)
  ix <- findInterval(x, dens$x)
  iy <- findInterval(y, dens$y)
  ii <- cbind(ix, iy)
  return(dens$z[ii])
}

hurricanes_recent$density <- get_density(hurricanes_recent$LONG, hurricanes_recent$LAT)
# creating a sample data.frame with your lat/lon points
hurricanes_recent$LAT <- as.numeric(hurricanes_recent$LAT)
hurricanes_recent$LONG <- as.numeric(hurricanes_recent$LONG)
#transform(hurricanes_recent, char = as.numeric(LAT))
#transform(hurricanes_recent, char = as.numeric(LONG))

# getting the map
map_hurricanes <- get_map(location = c(lon = mean(hurricanes_recent$LONG), lat = mean(hurricanes_recent$LAT)), zoom = 2, maptype = "terrain", scale = 2)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=21.893561,-89.486682&zoom=2&size=640x640&scale=2&maptype=terrain&language=en-EN
# plotting the map with some points on it
ggmap(map_hurricanes) +
  geom_point(data = hurricanes_recent, aes(x = LONG, y = LAT, alpha = 0.1, color = density)) + scale_color_viridis() + guides(size=FALSE)
## Warning: Removed 421 rows containing missing values (geom_point).

# Total number of hurricanes per year
hurricanes_per_year <- group_by(hurricanes_recent, YEAR) %>%
  summarize(length(unique(NAME)))
colnames(hurricanes_per_year)[2] <- "TOTAL_HURRICANES"
ggplot(hurricanes_per_year, aes(x = YEAR, y = TOTAL_HURRICANES)) + geom_point()

# Total number of hurricanes of each category per year
hurricanes_categories <- group_by(hurricanes_recent, YEAR) %>%
  count(CAT)
colnames(hurricanes_categories)[3] <- "TOTAL_HURRICANES"
ggplot(hurricanes_categories, aes(x = YEAR, y = TOTAL_HURRICANES, color = CAT, fill = CAT)) + geom_bar(stat = "identity")

ggplot(hurricanes_categories, aes(x = YEAR, y = TOTAL_HURRICANES, color = CAT, fill = CAT)) + geom_bar(stat = "identity", position = position_dodge(width = 0.9))

# Average pressure by year
hurricane_pressure <- group_by(hurricanes_recent, YEAR) %>%
  summarize(mean(PRESSURE))
colnames(hurricane_pressure)[2] <- 'AVG_PRESSURE'
colnames(hurricane_pressure)[1] <- "YEAR"
ggplot(hurricane_pressure, aes(x = YEAR, y = AVG_PRESSURE)) + geom_point() + ylab("Average pressure")

# Average max wind speed by year
hurricanes_wind <- group_by(hurricanes_recent, YEAR) %>%
  summarize(mean(WIND_KTS))
colnames(hurricanes_wind)[2] <- "AVG_WIND"
ggplot(hurricanes_wind, aes(x = YEAR, y = AVG_WIND)) + geom_point() + ylab("Average wind speed (kts)")

# If you want to save a plot...
#ggsave("hurri_categories2.pdf", plot = last_plot(), height = 5, width = 10)

Temperature

temp_data <- read.csv("ZonAnn.csv")

temp_data[!complete.cases(temp_data),]
##  [1] Year     Glob     NHem     SHem     X24N.90N X24S.24N X90S.24S
##  [8] X64N.90N X44N.64N X24N.44N EQU.24N  X24S.EQU X44S.24S X64S.44S
## [15] X90S.64S
## <0 rows> (or 0-length row.names)
temp_data<-na.omit(temp_data)
temp_data<-temp_data[,-(5:15),drop=FALSE]
temp_recent <- temp_data %>%
  filter(Year > 1949)
dim(temp_recent)
## [1] 67  4
names(temp_recent)
## [1] "Year" "Glob" "NHem" "SHem"
head(temp_recent)
##   Year  Glob  NHem  SHem
## 1 1950 -0.18 -0.18 -0.20
## 2 1951 -0.06  0.04 -0.17
## 3 1952  0.01  0.05 -0.03
## 4 1953  0.07  0.23 -0.08
## 5 1954 -0.14 -0.04 -0.25
## 6 1955 -0.14 -0.11 -0.20
summary(temp_recent)
##       Year           Glob              NHem              SHem        
##  Min.   :1950   Min.   :-0.2000   Min.   :-0.2600   Min.   :-0.2500  
##  1st Qu.:1966   1st Qu.: 0.0200   1st Qu.: 0.0250   1st Qu.:-0.0250  
##  Median :1983   Median : 0.1800   Median : 0.1700   Median : 0.2600  
##  Mean   :1983   Mean   : 0.2536   Mean   : 0.2901   Mean   : 0.2161  
##  3rd Qu.:2000   3rd Qu.: 0.4950   3rd Qu.: 0.6150   3rd Qu.: 0.4150  
##  Max.   :2016   Max.   : 0.9900   Max.   : 1.2800   Max.   : 0.7000
# plot of Global, Northern Hemisphere, Southern Hemisphere deviation from corresponding mean by year
ggplot(temp_recent,aes(x=Year))+
  geom_line(aes(y=Glob, color="Global"))+
  geom_line(aes(y=NHem, color="Northern Hemisphere"))+
  geom_line(aes(y=SHem, color="Southern Hemisphere"))+
<<<<<<< HEAD
  ylab("Temperature Anomolies")+
  geom_smooth(method='lm', aes(Year,Glob), colour="red", size=0.5, se=FALSE)+
  geom_smooth(method='lm', aes(Year,NHem),colour="green", size=0.5, se=FALSE)+
  geom_smooth(method='lm', aes(Year,SHem), colour="blue", size=0.5, se=FALSE)

CO2

CO2 <- read_csv("~/Desktop/Bayes Capstone/CO2.csv")
## Parsed with column specification:
## cols(
##   year = col_integer(),
##   month = col_integer(),
##   decimal = col_double(),
##   average = col_double(),
##   trend = col_double()
## )
# Alter CO2 to remove unnecessary variable trend
CO2 <- subset(CO2, select = -trend)

# Create subset of CO2 data from year 2000
# to show only fluxuations in CO2 levels throughout one year
CO2_2000 <- subset(CO2, year == 2000)
head(CO2_2000)
## # A tibble: 6 x 4
##    year month  decimal average
##   <int> <int>    <dbl>   <dbl>
## 1  2000     1 2000.042  369.21
## 2  2000     2 2000.125  369.46
## 3  2000     3 2000.208  369.78
## 4  2000     4 2000.292  370.18
## 5  2000     5 2000.375  370.08
## 6  2000     6 2000.458  369.17
# Plot this example
plot(CO2_2000$average ~ CO2_2000$month)

# Create time series to show variation over months and years
Average <- ts(CO2$average, frequency = 12, start = 1980)

# Plot and add axis labels
v1 <- c(1980,1985,1990,1995,2000,2005,2010,2015)
v2 <- c(1980,1985,1990,1995,2000,2005,2010,2015)
plot(Average)
axis(side = 1, 
     at = v1, 
     labels = v2,
     tck=-.05)

# Create colored time series 
CO2$Date <- as.yearmon(paste(CO2$year, CO2$month), "%Y %m")

# Plot this time series
ggplot(CO2, aes(Date, Average)) + geom_line( aes(colour = year))
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

# Add trend line
ggplot(CO2, aes(Date, Average)) + geom_line( aes(colour = year)) + geom_smooth()
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## `geom_smooth()` using method = 'loess'

# Show increasing CO2 over the years by month
ggplot(CO2, aes(x = month, y = average, color = year)) + geom_point()

======= ylab("Temperature Anomolies") <<<<<<< HEAD

#ggplot(hurricanes_categories, aes(x = YEAR, y = TOTAL_HURRICANES, color = CAT, fill = CAT)) + 
  #geom_bar(stat = "identity") +
  #geom_abline(data=temp_data, aes(x=Year, y=Glob))
# Show increasing CO2 over the years by month
library(ggplot2)
library(lubridate)

CO2 <- read_csv("~/Desktop/Bayes Capstone/CO2.csv")
## Parsed with column specification:
## cols(
##   year = col_integer(),
##   month = col_integer(),
##   decimal = col_double(),
##   average = col_double(),
##   trend = col_double()
## )
ggplot(CO2, aes(x = month, y = average, color = year)) + geom_point()

CO2

# Alter CO2 to remove unnecessary variable trend
CO2 <- subset(CO2, select = -trend)
# Create subset of CO2 data from year 2000
# to show only fluxuations in CO2 levels throughout one year
CO2_2000 <- subset(CO2, year == 2000)
head(CO2_2000)
## # A tibble: 6 x 4
##    year month  decimal average
##   <int> <int>    <dbl>   <dbl>
## 1  2000     1 2000.042  369.21
## 2  2000     2 2000.125  369.46
## 3  2000     3 2000.208  369.78
## 4  2000     4 2000.292  370.18
## 5  2000     5 2000.375  370.08
## 6  2000     6 2000.458  369.17
# Plot this example
plot(CO2_2000$average ~ CO2_2000$month)

# Create time series to show variation over months and years
Average <- ts(CO2$average, frequency = 12, start = 1980)


# Plot and add axis labels
v1 <- c(1980,1985,1990,1995,2000,2005,2010,2015)
v2 <- c(1980,1985,1990,1995,2000,2005,2010,2015)
plot(Average)
axis(side = 1, 
     at = v1, 
     labels = v2,
     tck=-.05)

# Create colored time series 
CO2$Date <- as.yearmon(paste(CO2$year, CO2$month), "%Y %m")

ggplot(CO2, aes(Date, Average)) + geom_line(aes(colour = year))
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

# Show increasing CO2 over the years by month
ggplot(CO2, aes(x = month, y = average, color = year)) + geom_point()

=======

>>>>>>> 2cf57e92ab75bb271078cf20c211637c4172c6fc >>>>>>> 48722bde6780321fa48cb715ace959574fced67a

NAO Index Data

NAO_index1 <- read_csv("NAO_index_monthly.csv", 
    col_types = cols(Apr = col_number(), 
        Aug = col_number(), Dec = col_number(), 
        Feb = col_number(), Jan = col_number(), 
        Jul = col_number(), Jun = col_number(), 
        Mar = col_number(), May = col_number(), 
        Nov = col_number(), Oct = col_number(), 
        Sep = col_number()))
## Warning: Missing column names filled in: 'X1' [1]
NAO_index2 <- NAO_index1[, -c(14)]

NAO_index3 <- NAO_index2 %>%
  rename(year = X1) %>%
  mutate(Jul = replace(Jul, Jul < -50| Jul > 50, "")) %>%
  mutate(Aug = replace(Aug, Aug < -50| Aug > 50, "")) %>%
  mutate(Sep = replace(Sep, Sep < -50| Sep > 50, "")) %>%
  mutate(Oct = replace(Oct, Oct < -50| Oct > 50, "")) %>%
  mutate(Nov = replace(Nov, Nov < -50| Nov > 50, "")) %>%
  mutate(Dec = replace(Dec, Dec < -50| Dec > 50, ""))
  

NAO_index <- NAO_index3[-nrow(NAO_index3),] 

dim(NAO_index)
## [1] 153  13
names(NAO_index)
##  [1] "year" "Jan"  "Feb"  "Mar"  "Apr"  "May"  "Jun"  "Jul"  "Aug"  "Sep" 
## [11] "Oct"  "Nov"  "Dec"
head(NAO_index)
## # A tibble: 6 x 13
##    year   Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
##   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1  1865  -0.6  -1.2   0.2  -0.2  -0.4   0.0   0.5   1.5   1.8    -2  -0.9
## 2  1866   0.5   0.8  -0.6  -2.3  -2.0   0.9  -0.5  -0.2   2.4  -0.3  -0.5
## 3  1867  -3.5   1.1  -4.3   1.8  -4.2   0.1    -2   1.9   1.4   2.2  -3.5
## 4  1868   0.7   3.0   3.6   1.7   2.3   3.1   0.4   1.5  -2.8   3.5  -1.8
## 5  1869   1.0   2.5   0.2  -0.2  -2.7  -1.9  -0.3    -1  -0.4  -1.4   1.4
## 6  1870   0.4  -2.8  -2.0   1.1   1.5   0.3   0.5  -3.6   0.2   0.9  -2.1
## # ... with 1 more variables: Dec <chr>
summary(NAO_index)
##      year                Jan               Feb               Mar         
##  Length:153         Min.   :-5.9000   Min.   :-4.9000   Min.   :-4.3000  
##  Class :character   1st Qu.:-0.9000   1st Qu.:-1.0000   1st Qu.:-1.2000  
##  Mode  :character   Median : 0.5000   Median : 0.5000   Median : 0.3000  
##                     Mean   : 0.1052   Mean   : 0.1641   Mean   : 0.1719  
##                     3rd Qu.: 1.4000   3rd Qu.: 1.4000   3rd Qu.: 1.4000  
##                     Max.   : 3.5000   Max.   : 3.7000   Max.   : 4.4000  
##       Apr                May                Jun             Jul           
##  Min.   :-3.30000   Min.   :-4.20000   Min.   :-4.000   Length:153        
##  1st Qu.:-1.30000   1st Qu.:-1.20000   1st Qu.:-1.400   Class :character  
##  Median :-0.10000   Median :-0.20000   Median :-0.100   Mode  :character  
##  Mean   : 0.01307   Mean   :-0.02353   Mean   :-0.132                     
##  3rd Qu.: 1.20000   3rd Qu.: 1.20000   3rd Qu.: 1.000                     
##  Max.   : 4.10000   Max.   : 5.20000   Max.   : 3.500                     
##      Aug                Sep                Oct           
##  Length:153         Length:153         Length:153        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      Nov                Dec           
##  Length:153         Length:153        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# remove 2017 (data for year is incomplete)
NAO_index <- head(NAO_index, -1) 

# reshape table
NAO_index <- melt(NAO_index, id=c("year"))
colnames(NAO_index)[2] <- "month"
colnames(NAO_index)[3] <- "index"

NAO_index$year <- as.factor(NAO_index$year)
NAO_index$index <- as.numeric(NAO_index$index)

# group by month
NAO_avgMonth <- aggregate(NAO_index[, 3], list(NAO_index$month), mean)

# plot
ggplot(data = NAO_avgMonth, aes(x = Group.1, y = x)) + geom_point(color = "blue") + xlab("month") + ylab("NAO index, averaged across all years")